{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "uxRWtMgpJcZR"
   },
   "source": [
    "To measure the run times on Google Colab, please follow the steps below:\n",
    "\n",
    "1- connect to a GPU environment on Google Colab\n",
    "\n",
    "2- upload \"data_clean\\labels.csv\" and \"data_clean\\text.csv\" to the cloud environment.\n",
    "\n",
    "3- install scikit-multilearn using the command \"!pip install scikit-multilearn\"\n",
    "\n",
    "4- run the cells below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "7qj1AprPVG9w"
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "\n",
    "\n",
    "# Classifier used for each label. Insertion order matters to any code\n",
    "# that iterates over this dict, so the original order is kept.\n",
    "models_dict = {\n",
    "    'S6_is_formal': RandomForestClassifier(n_estimators=100),\n",
    "    'S6_is_legal': RandomForestClassifier(n_estimators=1000),\n",
    "    'S6_is_technical': RandomForestClassifier(n_estimators=500),\n",
    "    'S6_is_aggressive': RandomForestClassifier(n_estimators=100),\n",
    "    'S8_dummy_Activities': GradientBoostingClassifier(n_estimators=100),\n",
    "    'S8_dummy_Budget': RandomForestClassifier(n_estimators=500),\n",
    "    'S8_dummy_Evaluation': RandomForestClassifier(n_estimators=1000),\n",
    "    'S8_dummy_ExternalContracts': RandomForestClassifier(n_estimators=1000),\n",
    "    'S8_dummy_InstStruc': RandomForestClassifier(n_estimators=1000),\n",
    "    'S8_dummy_Other': RandomForestClassifier(n_estimators=500),\n",
    "    'S8_dummy_Regulatory': RandomForestClassifier(n_estimators=1000),\n",
    "    'S9_dummy_Academic/Scholarly': RandomForestClassifier(n_estimators=500),\n",
    "    'S9_dummy_Commercial': RandomForestClassifier(n_estimators=500),\n",
    "    'S9_dummy_Impossible to say': RandomForestClassifier(n_estimators=1000),\n",
    "    'S9_dummy_Monitoring': RandomForestClassifier(n_estimators=500),\n",
    "    'S9_dummy_Personal': RandomForestClassifier(n_estimators=500),\n",
    "    'S10_is_clear': RandomForestClassifier(n_estimators=1000),\n",
    "    'S10_is_competency_of_institution': RandomForestClassifier(\n",
    "        n_estimators=100),\n",
    "    'S10_is_public': RandomForestClassifier(n_estimators=500),\n",
    "    'S10_is_existant': RandomForestClassifier(n_estimators=100),\n",
    "    'S11_dummy_Date': RandomForestClassifier(n_estimators=500),\n",
    "    'S11_dummy_Document': RandomForestClassifier(n_estimators=1000),\n",
    "    'S11_dummy_Institution': RandomForestClassifier(n_estimators=1000),\n",
    "    'S11_dummy_Organization': RandomForestClassifier(n_estimators=100),\n",
    "    'S11_dummy_Person': RandomForestClassifier(n_estimators=1000),\n",
    "    'S11_dummy_Place': RandomForestClassifier(n_estimators=500),\n",
    "}\n",
    "\n",
    "\n",
    "opt_thresholds = {'S6_is_formal': 0.21,\n",
    "               'S6_is_legal': 0.22,\n",
    "               'S6_is_technical': 0.22,\n",
    "               'S6_is_aggressive': 0.18,\n",
    "               'S8_dummy_Activities': 0.32,\n",
    "               'S8_dummy_Budget': 0.26,\n",
    "               'S8_dummy_Evaluation': 0.23,\n",
    "               'S8_dummy_ExternalContracts': 0.31,\n",
    "               'S8_dummy_InstStruc': 0.32,\n",
    "               'S8_dummy_Other': 0.04,\n",
    "               'S8_dummy_Regulatory': 0.20,\n",
    "               'S9_dummy_Academic/Scholarly': 0.20,\n",
    "               'S9_dummy_Commercial': 0.17,\n",
    "               'S9_dummy_Impossible to say': 0.33,\n",
    "               'S9_dummy_Monitoring': 0.36,\n",
    "               'S9_dummy_Personal': 0.19,\n",
    "               'S10_is_clear': 0.50,\n",
    "               'S10_is_competency_of_institution': 0.50,\n",
    "               'S10_is_public': 0.50,\n",
    "               'S10_is_existant': 0.50,\n",
    "               'S11_dummy_Date': 0.21,\n",
    "               'S11_dummy_Document': 0.26,\n",
    "               'S11_dummy_Institution': 0.23,\n",
    "               'S11_dummy_Organization': 0.20,\n",
    "               'S11_dummy_Person': 0.23,\n",
    "               'S11_dummy_Place': 0.21}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "m0236Cd0UgSs"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "from scipy.sparse import load_npz\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import sklearn.metrics as metrics\n",
    "from sklearn.svm import SVC\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance\n",
    "from copy import deepcopy\n",
    "from math import ceil\n",
    "from skmultilearn.ensemble import RakelD\n",
    "import sys\n",
    "import re\n",
    "import string\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense\n",
    "from tensorflow.keras.layers import Dropout, Bidirectional\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from skmultilearn.adapt import MLkNN\n",
    "import tensorflow as tf\n",
    "\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')\n",
    "\n",
    "\n",
    "def tokenize(s):\n",
    "    return re_tok.sub(r' \\1 ', s).split()\n",
    "\n",
    "\n",
    "def run_ECC(\n",
    "        X, y_all, label_cols,\n",
    "        function_name, seeds=list(range(2)), split_size=0.2):\n",
    "    \"\"\"\n",
    "    Runs Ensemble Classifier Chain and saves results in file\n",
    "\n",
    "    For each seed, five logistic-regression classifier chains are trained,\n",
    "    each on a random ordering of the labels. Their predicted probabilities\n",
    "    are averaged and thresholded at 0.5. The metrics of every seed are\n",
    "    written to '<function_name>.csv' and their mean and std are printed.\n",
    "    Inputs\n",
    "        X : cleaned text of requests (TF-IDF vectorized inside)\n",
    "        y_all : target labels (train and test)\n",
    "        label_cols : list of labels' names\n",
    "        function_name : name used in the log and for the output file\n",
    "        seeds : seeds for the train/test split and the chain orderings\n",
    "        split_size : percentage of test data (optional, default = 0.2)\n",
    "    \"\"\"\n",
    "    n_chains = 5\n",
    "    temp = []\n",
    "    y_values = y_all[label_cols].values\n",
    "    for i, seed in enumerate(seeds):\n",
    "\n",
    "        print(f'Iteration {i+1} of {len(seeds)}')\n",
    "        print(f'Running {function_name}')\n",
    "\n",
    "        # Seeded generator: the label orderings are reproducible per seed\n",
    "        # instead of depending on NumPy's global random state.\n",
    "        rng = np.random.RandomState(seed)\n",
    "\n",
    "        # The split and the TF-IDF features do not depend on the label\n",
    "        # ordering, so they are computed once per seed, not once per chain.\n",
    "        X_train, X_test, y_train, y_test = train_test_split(\n",
    "            X, y_values, test_size=split_size, random_state=seed)\n",
    "\n",
    "        vectorizer = TfidfVectorizer(\n",
    "            ngram_range=(1, 2),\n",
    "            tokenizer=tokenize,\n",
    "            encoding='utf-8',\n",
    "            use_idf=True,\n",
    "            smooth_idf=True,\n",
    "            max_features=None,\n",
    "            norm='l2',\n",
    "            max_df=0.5,\n",
    "            min_df=0.01, strip_accents='unicode',\n",
    "            sublinear_tf=True\n",
    "        )\n",
    "        X_train = vectorizer.fit_transform(X_train)\n",
    "        X_test = vectorizer.transform(X_test)\n",
    "\n",
    "        outputs_list = []\n",
    "        for _ in range(n_chains):\n",
    "            permute = rng.permutation(len(label_cols))\n",
    "            # argsort of a permutation is its inverse: it maps the chain's\n",
    "            # output columns back to the original label order.\n",
    "            reorder = np.argsort(permute)\n",
    "\n",
    "            model = ClassifierChain(LogisticRegression(\n",
    "                C=1, solver='lbfgs', max_iter=500))\n",
    "            model.fit(X_train, y_train[:, permute])\n",
    "            pred_probs = model.predict_proba(X_test).toarray()\n",
    "\n",
    "            outputs_list.append(pred_probs[:, reorder])\n",
    "\n",
    "        ens_out = np.mean(outputs_list, axis=0)\n",
    "        labels_out = 1 * (ens_out > 0.5)\n",
    "\n",
    "        temp.append({\n",
    "            'acc': 100*metrics.accuracy_score(y_test, labels_out),\n",
    "            'micro': 100*metrics.f1_score(\n",
    "                y_test, labels_out, average='micro'),\n",
    "            'macro': 100*metrics.f1_score(\n",
    "                y_test, labels_out, average='macro'),\n",
    "            'hamming': 100*metrics.hamming_loss(y_test, labels_out),\n",
    "            'r_loss': metrics.label_ranking_loss(y_test, ens_out),\n",
    "        })\n",
    "\n",
    "    df_final = pd.DataFrame(temp)\n",
    "    print(f'Metrics : \\n{df_final.mean(axis=0)}')\n",
    "    print(f'Std : \\n{df_final.std(axis=0)}')\n",
    "    df_final.to_csv(f'{function_name}.csv')\n",
    "\n",
    "\n",
    "def run_BR_opt_models(\n",
    "        X, y_all, models_dict, label_cols,\n",
    "        function_name, seeds=list(range(2)), split_size=0.2):\n",
    "    \"\"\"\n",
    "    Runs Binary Relevance models and saves results in file\n",
    "\n",
    "    One independent classifier (a copy of models_dict[label]) is trained\n",
    "    per label. When function_name is 'SMOTE' the training set of every\n",
    "    label is oversampled with SMOTE before fitting.\n",
    "    Inputs\n",
    "        X : cleaned text of requests (TF-IDF vectorized inside)\n",
    "        y_all : target labels (train and test)\n",
    "        models_dict : dictionary with each model and its hyperparemeters\n",
    "        label_cols : list of labels' names\n",
    "        function_name : name used in the log and for the output file;\n",
    "                        the value 'SMOTE' additionally enables oversampling\n",
    "        seeds : seeds to generate the train/test splits (optional)\n",
    "        split_size : percentage of test data (optional, default = 0.2)\n",
    "    \"\"\"\n",
    "    temp = []\n",
    "    y_values = y_all[label_cols].values\n",
    "    for i, seed in enumerate(seeds):\n",
    "        print(f'Iteration {i+1} of {len(seeds)}')\n",
    "\n",
    "        # The split only depends on the number of rows and the seed, and the\n",
    "        # TF-IDF features only on the training texts, so both are computed\n",
    "        # once per seed and shared by all labels.\n",
    "        X_train, X_test, y_train, y_test = train_test_split(\n",
    "            X, y_values, test_size=split_size, random_state=seed)\n",
    "\n",
    "        vectorizer = TfidfVectorizer(\n",
    "            ngram_range=(1, 2),\n",
    "            tokenizer=tokenize,\n",
    "            encoding='utf-8',\n",
    "            use_idf=True,\n",
    "            smooth_idf=True,\n",
    "            max_features=None,\n",
    "            norm='l2',\n",
    "            max_df=0.5,\n",
    "            min_df=0.01, strip_accents='unicode',\n",
    "            sublinear_tf=True\n",
    "        )\n",
    "        X_train = vectorizer.fit_transform(X_train)\n",
    "        X_test = vectorizer.transform(X_test)\n",
    "\n",
    "        # Sized from the actual test split rather than a separate estimate.\n",
    "        pred_all = np.empty(y_test.shape)\n",
    "        prob_all = np.empty(y_test.shape)\n",
    "\n",
    "        for k, label in enumerate(label_cols):\n",
    "\n",
    "            print(f'Running {function_name}, label = {label}')\n",
    "\n",
    "            # Fresh copy so the shared template in models_dict is never fitted.\n",
    "            model = deepcopy(models_dict[label])\n",
    "            X_fit, y_fit = X_train, y_train[:, k]\n",
    "\n",
    "            if function_name == 'SMOTE':\n",
    "                sm = SMOTE(random_state=42)\n",
    "                # fit_resample replaces the removed fit_sample method.\n",
    "                X_fit, y_fit = sm.fit_resample(X_fit, y_fit)\n",
    "\n",
    "            model.fit(X_fit, y_fit)\n",
    "\n",
    "            pred_all[:, k] = model.predict(X_test)\n",
    "            # Column 1 is the probability of the positive class.\n",
    "            prob_all[:, k] = model.predict_proba(X_test)[:, 1]\n",
    "\n",
    "        print(f'{function_name} \\n')\n",
    "\n",
    "        temp.append({\n",
    "            'acc': 100*metrics.accuracy_score(y_test, pred_all),\n",
    "            'micro': 100*metrics.f1_score(\n",
    "                y_test, pred_all, average='micro'),\n",
    "            'macro': 100*metrics.f1_score(\n",
    "                y_test, pred_all, average='macro'),\n",
    "            'hamming': 100*metrics.hamming_loss(y_test, pred_all),\n",
    "            'r_loss': metrics.label_ranking_loss(y_test, prob_all),\n",
    "        })\n",
    "\n",
    "    df_final = pd.DataFrame(temp)\n",
    "\n",
    "    print(f'Metrics : \\n{df_final.mean(axis=0)}')\n",
    "    print(f'Std : \\n{df_final.std(axis=0)}')\n",
    "\n",
    "    df_final.to_csv(f'{function_name}.csv')\n",
    "\n",
    "\n",
    "def run_BR_opt_th(\n",
    "        X, y_all, label_cols,\n",
    "        function_name, opt_thresholds=opt_thresholds,\n",
    "        seeds=list(range(10)), split_size=0.2):\n",
    "\n",
    "    \"\"\"\n",
    "    Runs BR with optimized thresholds and saves results in file\n",
    "\n",
    "    A label is predicted as positive when its probability is strictly\n",
    "    greater than the threshold configured for that label.\n",
    "    Inputs\n",
    "        X : cleaned text of requests (TF-IDF vectorized inside)\n",
    "        y_all : target labels (train and test)\n",
    "        label_cols : list of labels' names\n",
    "        function_name : name used in the log and for the output file\n",
    "        opt_thresholds : dictionary mapping each label name to its\n",
    "                         optimized threshold\n",
    "        seeds : seeds to generate the train/test splits (optional)\n",
    "        split_size : percentage of test data (optional, default = 0.2)\n",
    "    \"\"\"\n",
    "    temp = []\n",
    "    # One threshold per column, in the same order as label_cols; built once\n",
    "    # because it does not change between seeds.\n",
    "    thresholds = np.array([opt_thresholds[label] for label in label_cols])\n",
    "    for i, seed in enumerate(seeds):\n",
    "\n",
    "        print(f'Iteration {i+1} of {len(seeds)}')\n",
    "        print(f'Running {function_name}')\n",
    "\n",
    "        model = BinaryRelevance(LogisticRegression(\n",
    "            C=1, solver='lbfgs', max_iter=500))\n",
    "\n",
    "        X_train, X_test, y_train, y_test = train_test_split(\n",
    "            X, y_all[label_cols].values, test_size=split_size,\n",
    "            random_state=seed)\n",
    "        vectorizer = TfidfVectorizer(\n",
    "            ngram_range=(1, 2),\n",
    "            tokenizer=tokenize,\n",
    "            encoding='utf-8',\n",
    "            use_idf=True,\n",
    "            smooth_idf=True,\n",
    "            max_features=None,\n",
    "            norm='l2',\n",
    "            max_df=0.5,\n",
    "            min_df=0.01, strip_accents='unicode',\n",
    "            sublinear_tf=True\n",
    "        )\n",
    "\n",
    "        X_train = vectorizer.fit_transform(X_train)\n",
    "        X_test = vectorizer.transform(X_test)\n",
    "        model.fit(X_train, y_train)\n",
    "\n",
    "        # Densify explicitly so the comparison below is a plain NumPy\n",
    "        # broadcast of the per-label thresholds over the rows.\n",
    "        pred_probs = model.predict_proba(X_test).toarray()\n",
    "        y_pred = (pred_probs > thresholds) * 1\n",
    "\n",
    "        temp.append({\n",
    "            'acc': 100*metrics.accuracy_score(y_test, y_pred),\n",
    "            'micro': 100*metrics.f1_score(\n",
    "                y_test, y_pred, average='micro'),\n",
    "            'macro': 100*metrics.f1_score(\n",
    "                y_test, y_pred, average='macro'),\n",
    "            'hamming': 100*metrics.hamming_loss(y_test, y_pred),\n",
    "            'r_loss': metrics.label_ranking_loss(y_test, pred_probs),\n",
    "        })\n",
    "\n",
    "    df_final = pd.DataFrame(temp)\n",
    "    print(f'Metrics : \\n{df_final.mean(axis=0)}')\n",
    "    print(f'Std : \\n{df_final.std(axis=0)}')\n",
    "    df_final.to_csv(f'{function_name}.csv')\n",
    "\n",
    "\n",
    "def run_multilabel(\n",
    "        X, y_all, label_cols,\n",
    "        function_name, seeds=list(range(2)), split_size=0.2):\n",
    "\n",
    "    \"\"\"\n",
    "    Runs Multilabel model according to function_name parameter\n",
    "    Saves results in file\n",
    "    Inputs\n",
    "        X : cleaned text of requests (TF-IDF vectorized inside)\n",
    "        y_all : target labels (train and test)\n",
    "        label_cols : list of labels' names\n",
    "        function_name : model to run, one of 'CC' (classifier chain),\n",
    "                        'BR' (binary relevance) or 'MLKNN'; also used as\n",
    "                        the name of the output file\n",
    "        seeds : seeds to generate the train/test splits (optional)\n",
    "        split_size : percentage of test data (optional, default = 0.2)\n",
    "    Raises\n",
    "        ValueError : if function_name is not one of the supported models\n",
    "    \"\"\"\n",
    "    # Factories (not instances) so that every seed gets an unfitted model.\n",
    "    builders = {\n",
    "        'CC': lambda: ClassifierChain(LogisticRegression(\n",
    "            C=1, solver='lbfgs', max_iter=500)),\n",
    "        'BR': lambda: BinaryRelevance(LogisticRegression(\n",
    "            C=1, solver='lbfgs', max_iter=500)),\n",
    "        'MLKNN': lambda: MLkNN(k=2, s=0.5),\n",
    "    }\n",
    "    # Fail early with a clear message instead of hitting an unbound\n",
    "    # `model` variable after the data has already been processed.\n",
    "    if function_name not in builders:\n",
    "        raise ValueError(\n",
    "            f'Unknown function_name {function_name!r}; '\n",
    "            f'expected one of {sorted(builders)}')\n",
    "\n",
    "    temp = []\n",
    "    for i, seed in enumerate(seeds):\n",
    "\n",
    "        print(f'Iteration {i+1} of {len(seeds)}')\n",
    "        print(f'Running {function_name}')\n",
    "\n",
    "        model = builders[function_name]()\n",
    "\n",
    "        X_train, X_test, y_train, y_test = train_test_split(\n",
    "            X, y_all[label_cols].values, test_size=split_size,\n",
    "            random_state=seed)\n",
    "\n",
    "        vectorizer = TfidfVectorizer(\n",
    "            ngram_range=(1, 2),\n",
    "            tokenizer=tokenize,\n",
    "            encoding='utf-8',\n",
    "            use_idf=True,\n",
    "            smooth_idf=True,\n",
    "            max_features=None,\n",
    "            norm='l2',\n",
    "            max_df=0.5,\n",
    "            min_df=0.01, strip_accents='unicode',\n",
    "            sublinear_tf=True\n",
    "        )\n",
    "\n",
    "        X_train = vectorizer.fit_transform(X_train)\n",
    "        X_test = vectorizer.transform(X_test)\n",
    "        model.fit(X_train, y_train)\n",
    "\n",
    "        # The models return sparse results; the metrics need dense arrays.\n",
    "        y_pred = model.predict(X_test).toarray()\n",
    "        pred_probs = model.predict_proba(X_test).toarray()\n",
    "\n",
    "        temp.append({\n",
    "            'acc': 100*metrics.accuracy_score(y_test, y_pred),\n",
    "            'micro': 100*metrics.f1_score(\n",
    "                y_test, y_pred, average='micro'),\n",
    "            'macro': 100*metrics.f1_score(\n",
    "                y_test, y_pred, average='macro'),\n",
    "            'hamming': 100*metrics.hamming_loss(y_test, y_pred),\n",
    "            'r_loss': metrics.label_ranking_loss(y_test, pred_probs),\n",
    "        })\n",
    "\n",
    "    df_final = pd.DataFrame(temp)\n",
    "    print(f'Metrics : \\n{df_final.mean(axis=0)}')\n",
    "    print(f'Std : \\n{df_final.std(axis=0)}')\n",
    "    df_final.to_csv(f'{function_name}.csv')\n",
    "\n",
    "\n",
    "def run_rakel(\n",
    "        X, y_all, label_cols,\n",
    "        function_name, label_partition,\n",
    "        seeds=list(range(2)), split_size=0.2):\n",
    "    \"\"\"\n",
    "    Runs RAKEL classifier and saves results in file\n",
    "    Inputs\n",
    "        X : cleaned text of requests (TF-IDF vectorized inside)\n",
    "        y_all : target labels (train and test)\n",
    "        label_cols : list of labels' names\n",
    "        function_name : name used in the log and for the output file\n",
    "        label_partition : number of parts the label set is divided into;\n",
    "                          every label subset has\n",
    "                          len(label_cols) // label_partition labels\n",
    "        seeds : seeds to generate the train/test splits (optional)\n",
    "        split_size : percentage of test data (optional, default = 0.2)\n",
    "    Results are written to '<function_name>_<label_partition>.csv'.\n",
    "    \"\"\"\n",
    "    # Only the 4-way partition uses C=0.25; every other configuration uses\n",
    "    # C=1.0, which is the LogisticRegression default.\n",
    "    reg_strength = 0.25 if label_partition == 4 else 1.0\n",
    "\n",
    "    temp = []\n",
    "    for i, seed in enumerate(seeds):\n",
    "\n",
    "        print(f'Iteration {i+1} of {len(seeds)}')\n",
    "        print(f'Running {function_name}')\n",
    "\n",
    "        X_train, X_test, y_train, y_test = train_test_split(\n",
    "            X, y_all[label_cols].values, test_size=split_size,\n",
    "            random_state=seed)\n",
    "\n",
    "        model = RakelD(\n",
    "            base_classifier=LogisticRegression(\n",
    "                C=reg_strength, max_iter=1000),\n",
    "            base_classifier_require_dense=[True, True],\n",
    "            labelset_size=y_train.shape[1] // int(label_partition)\n",
    "        )\n",
    "\n",
    "        vectorizer = TfidfVectorizer(\n",
    "            ngram_range=(1, 2),\n",
    "            tokenizer=tokenize,\n",
    "            encoding='utf-8',\n",
    "            use_idf=True,\n",
    "            smooth_idf=True,\n",
    "            max_features=None,\n",
    "            norm='l2',\n",
    "            max_df=0.5,\n",
    "            min_df=0.01, strip_accents='unicode',\n",
    "            sublinear_tf=True\n",
    "        )\n",
    "\n",
    "        X_train = vectorizer.fit_transform(X_train)\n",
    "        X_test = vectorizer.transform(X_test)\n",
    "        model.fit(X_train, y_train)\n",
    "\n",
    "        # The model returns sparse results; the metrics need dense arrays.\n",
    "        y_pred = model.predict(X_test).toarray()\n",
    "        pred_probs = model.predict_proba(X_test).toarray()\n",
    "\n",
    "        temp.append({\n",
    "            'acc': 100*metrics.accuracy_score(y_test, y_pred),\n",
    "            'micro': 100*metrics.f1_score(\n",
    "                y_test, y_pred, average='micro'),\n",
    "            'macro': 100*metrics.f1_score(\n",
    "                y_test, y_pred, average='macro'),\n",
    "            'hamming': 100*metrics.hamming_loss(y_test, y_pred),\n",
    "            'r_loss': metrics.label_ranking_loss(y_test, pred_probs),\n",
    "        })\n",
    "\n",
    "    df_final = pd.DataFrame(temp)\n",
    "    print(f'Metrics : \\n{df_final.mean(axis=0)}')\n",
    "    print(f'Std : \\n{df_final.std(axis=0)}')\n",
    "    df_final.to_csv(f'{function_name}_{label_partition}.csv')\n",
    "\n",
    "\n",
    "def train_NN(\n",
    "        X, y_all, label_cols, function_name='CNN', seeds=list(range(2)),\n",
    "        split_size=0.2):\n",
    "\n",
    "    \"\"\"\n",
    "    Runs Neural Network classifier and saves results in file\n",
    "\n",
    "    One model is trained per seed. The metrics of every seed are written\n",
    "    to '<function_name>.csv' and every trained model is saved to\n",
    "    'model_<function_name in lower case>_<seed>.h5'.\n",
    "    Inputs\n",
    "        X : cleaned text of requests (must expose `.values`, e.g. a\n",
    "            pandas Series)\n",
    "        y_all : target labels (train and test)\n",
    "        label_cols : list of labels' names\n",
    "        function_name : architecture to train, 'CNN' or 'LSTM'\n",
    "        seeds : seeds to generate the train/test splits (optional)\n",
    "        split_size : percentage of test data (optional, default = 0.2)\n",
    "    Raises\n",
    "        ValueError : if function_name is neither 'CNN' nor 'LSTM'\n",
    "    \"\"\"\n",
    "    # Fail early instead of hitting an unbound `model` after tokenizing.\n",
    "    if function_name not in ('CNN', 'LSTM'):\n",
    "        raise ValueError(\n",
    "            f'Unknown function_name {function_name!r}; '\n",
    "            'expected CNN or LSTM')\n",
    "\n",
    "    vocab_size = 10000\n",
    "    embedding_dim = 32\n",
    "    max_length = 400\n",
    "    trunc_type = 'post'\n",
    "    padding_type = 'post'\n",
    "    oov_tok = '<OOV>'\n",
    "    num_epochs = 250\n",
    "\n",
    "    all_results = []\n",
    "    # enumerate yields (index, value): the index is only used for the\n",
    "    # progress message, the value is the actual random seed.\n",
    "    for i, seed in enumerate(seeds):\n",
    "        print(f'Iteration {i+1} of {len(seeds)}')\n",
    "        print(f'Running {function_name}')\n",
    "\n",
    "        X_train, X_test, y_train, y_test = train_test_split(\n",
    "            X, y_all[label_cols].values, test_size=split_size,\n",
    "            random_state=seed)\n",
    "\n",
    "        # The vocabulary is built from the training texts only.\n",
    "        tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)\n",
    "        tokenizer.fit_on_texts(X_train.values.tolist())\n",
    "\n",
    "        # transforming the words into integer sequences and forcing all\n",
    "        # requests to have the same length (padding / truncation)\n",
    "        training_padded = pad_sequences(\n",
    "            tokenizer.texts_to_sequences(X_train), maxlen=max_length,\n",
    "            padding=padding_type, truncating=trunc_type)\n",
    "        testing_padded = pad_sequences(\n",
    "            tokenizer.texts_to_sequences(X_test), maxlen=max_length,\n",
    "            padding=padding_type, truncating=trunc_type)\n",
    "\n",
    "        # trains CNN\n",
    "        if function_name == 'CNN':\n",
    "            model = tf.keras.Sequential([\n",
    "                tf.keras.layers.Embedding(\n",
    "                    vocab_size, embedding_dim, input_length=max_length),\n",
    "                tf.keras.layers.Conv1D(32, 3, activation='relu'),\n",
    "                tf.keras.layers.GlobalAveragePooling1D(),\n",
    "                tf.keras.layers.Flatten(),\n",
    "                tf.keras.layers.Dense(32, activation='relu'),\n",
    "                tf.keras.layers.Dropout(0.2),\n",
    "                tf.keras.layers.Dense(len(label_cols), activation='sigmoid')])\n",
    "        # trains LSTM\n",
    "        else:\n",
    "            model = tf.keras.Sequential([\n",
    "                tf.keras.layers.Embedding(\n",
    "                    vocab_size, embedding_dim, input_length=max_length),\n",
    "                tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),\n",
    "                tf.keras.layers.Dense(32, activation='relu'),\n",
    "                tf.keras.layers.Dropout(0.2),\n",
    "                tf.keras.layers.Dense(len(label_cols), activation='sigmoid')])\n",
    "\n",
    "        model.compile(\n",
    "            loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "        # The test split is passed as validation data only so that its loss\n",
    "        # is logged per epoch; no callback uses it to steer training.\n",
    "        model.fit(training_padded, y_train, epochs=num_epochs,\n",
    "                  validation_data=(testing_padded, y_test), verbose=2)\n",
    "        preds = model.predict(testing_padded)\n",
    "\n",
    "        # from probabilities to 1's or 0's\n",
    "        labels_out = (preds > 0.5)*1\n",
    "\n",
    "        all_results.append({\n",
    "            'acc': 100*metrics.accuracy_score(y_test, labels_out),\n",
    "            'micro': 100*metrics.f1_score(\n",
    "                y_test, labels_out, average='micro'),\n",
    "            'macro': 100*metrics.f1_score(\n",
    "                y_test, labels_out, average='macro'),\n",
    "            'hamming': 100*metrics.hamming_loss(y_test, labels_out),\n",
    "            'r_loss': metrics.label_ranking_loss(y_test, preds),\n",
    "        })\n",
    "\n",
    "        # The file name carries the architecture so that CNN and LSTM runs\n",
    "        # do not overwrite each other ('LSTM' still gives model_lstm_*.h5).\n",
    "        model.save(f'model_{function_name.lower()}_{seed}.h5')\n",
    "\n",
    "    df_final = pd.DataFrame(all_results)\n",
    "    print(f'Metrics : \\n{df_final.mean(axis=0)}')\n",
    "    print(f'Std : \\n{df_final.std(axis=0)}')\n",
    "    df_final.to_csv(f'{function_name}.csv')\n",
    "\n",
    "\n",
    "def run_all(X, y_all, label_cols, seeds=list(range(10))):\n",
    "    \"\"\" Runs all models described in the paper\n",
    "\n",
    "    Every model is run with the same seeds and, after each one, the\n",
    "    average wall-clock time per seed is printed.\n",
    "    Inputs\n",
    "        X : cleaned text of requests\n",
    "        y_all : target labels (train and test)\n",
    "        label_cols : list of labels' names\n",
    "        seeds : seeds forwarded to every model runner (optional)\n",
    "    \"\"\"\n",
    "    # Avoids a division by zero in the average when no seed is given.\n",
    "    n_runs = max(len(seeds), 1)\n",
    "\n",
    "    def timed(label, runner, *args):\n",
    "        # Runs `runner` with the shared seeds and prints its average time,\n",
    "        # so no model can be left without a timing report.\n",
    "        start = time.time()\n",
    "        runner(*args, seeds=seeds)\n",
    "        elapsed = time.time() - start\n",
    "        print(f'Avg. time for {label}: {elapsed/n_runs:.2f}')\n",
    "\n",
    "    print('Running all models \\n')\n",
    "\n",
    "    for name in ('optimized_models', 'SMOTE'):\n",
    "        timed(name, run_BR_opt_models,\n",
    "              X, y_all, models_dict, label_cols, name)\n",
    "\n",
    "    timed('optimized_thresholds', run_BR_opt_th,\n",
    "          X, y_all, label_cols, 'optimized_thresholds')\n",
    "\n",
    "    for name in ('CC', 'BR', 'MLKNN'):\n",
    "        timed(name, run_multilabel, X, y_all, label_cols, name)\n",
    "\n",
    "    # The label includes the partition so the three RAkEL timings can be\n",
    "    # told apart; it matches the csv name written by run_rakel.\n",
    "    for partition in (4, 2, 1):\n",
    "        timed(f'rakel_{partition}', run_rakel,\n",
    "              X, y_all, label_cols, 'rakel', partition)\n",
    "\n",
    "    timed('ECC', run_ECC, X, y_all, label_cols, 'ECC')\n",
    "\n",
    "    for name in ('CNN', 'LSTM'):\n",
    "        timed(name, train_NN, X, y_all, label_cols, name)\n",
    "\n",
    "\n",
    "def main(argv, seeds=list(range(1))):\n",
    "    \"\"\"\n",
    "    Loads the data and runs the requested method\n",
    "\n",
    "    Reads 'text.csv' (column Clean_Text) and 'labels.csv' from the\n",
    "    working directory; the labels to predict are the keys of models_dict.\n",
    "    Inputs\n",
    "        argv : list whose first element may be the name of the method to\n",
    "               run; when it is missing or not a string, 'all' is used\n",
    "        seeds : seeds forwarded to the selected method (optional)\n",
    "    \"\"\"\n",
    "    # Non-string placeholders such as main([0]) keep running everything.\n",
    "    function_name = 'all'\n",
    "    if argv and isinstance(argv[0], str):\n",
    "        function_name = argv[0]\n",
    "\n",
    "    X = pd.read_csv('text.csv').Clean_Text\n",
    "    y_all = pd.read_csv('labels.csv')\n",
    "    label_cols = list(models_dict)\n",
    "\n",
    "    print(f'Running classifers for labels: \\n{ label_cols}')\n",
    "    if function_name == 'all':\n",
    "        run_all(X, y_all, label_cols, seeds=seeds)\n",
    "    elif function_name == 'optimized_thresholds':\n",
    "        run_BR_opt_th(X, y_all, label_cols, function_name, seeds=seeds)\n",
    "    elif function_name in ('optimized_models', 'SMOTE'):\n",
    "        run_BR_opt_models(\n",
    "                X, y_all, models_dict, label_cols, function_name, seeds=seeds)\n",
    "    elif function_name in ('CC', 'BR', 'MLKNN'):\n",
    "        run_multilabel(X, y_all, label_cols, function_name, seeds=seeds)\n",
    "    elif function_name == 'rakel_4':\n",
    "        run_rakel(X, y_all, label_cols, function_name, 4, seeds=seeds)\n",
    "    elif function_name == 'rakel_2':\n",
    "        run_rakel(X, y_all, label_cols, function_name, 2, seeds=seeds)\n",
    "    elif function_name == 'LP':\n",
    "        run_rakel(X, y_all, label_cols, function_name, 1, seeds=seeds)\n",
    "    elif function_name == 'ECC':\n",
    "        run_ECC(X, y_all, label_cols, function_name, seeds=seeds)\n",
    "    elif function_name in ('LSTM', 'CNN'):\n",
    "        train_NN(X, y_all, label_cols, function_name, seeds=seeds)\n",
    "    else:\n",
    "        print('Please enter one of the following methods:\\n'\n",
    "              '  all, optimized_thresholds, optimized_models, SMOTE, BR, '\n",
    "              'MLKNN, rakel_4, rakel_2, LSTM, CNN, LP, CC or ECC')\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # main([0]) runs every model. sys.argv is deliberately not reported:\n",
    "    # in a notebook kernel it typically holds the kernel's own launch\n",
    "    # arguments, and indexing it fails when no argument is present.\n",
    "    print('Running: all')\n",
    "    main([0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "MvDgkDkJWCKO"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "name": "Untitled",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
